Start with the necessary packages and seed for the vignette

loadedPackages <- c("broom", "fields", "foreach", "geojsonio",
                    "ggmap", "maptools", "osmdata", "raster",
                    "rgeos", "sp", "sparr", "spatstat", "tidyverse")
invisible(lapply(loadedPackages, require, character.only = TRUE))
set.seed(1234) # for reproducibility

Source the necessary custom functions developed in sparrpowR

source(file = "R_functions/spatial_power.R") # spatial_power() function
source(file = "R_functions/spatial_plots.R") # spatial_plots() function

Import data from Open Data DC website

# Washington, D.C. boundary
gis_path1 <- "https://opendata.arcgis.com/datasets/7241f6d500b44288ad983f0942b39663_10.geojson"
dc <- geojson_read(gis_path1,  what = "sp")

# American Community Survey 2018 Census Tracts
gis_path2 <- "https://opendata.arcgis.com/datasets/faea4d66e7134e57bf8566197f25b3a8_0.geojson"
census <- geojson_read(gis_path2,  what = "sp")

We want to create a realistic boundary (i.e., polygon) of our study area. We are going to spatially clip our DC boundary by the census tracts in an attempt to remove major bodies of water where people do not reside.

clipwin <- maptools::unionSpatialPolygons(census, IDs = rep(1, length(census)))
dcc <- rgeos::gIntersection(dc, clipwin, byid = TRUE)
# Plot
par(mfrow=c(1,3))
plot(dc, main = "DC Boundary")
plot(census,  main = "American Community Survey\n2018")
plot(dcc, main = "Clipped Boundary")

Our developed method, sparrpowR, relies on the spatstat package to simulate data, which assumes point locations are on a planar (i.e., flat) surface. Our boundary is made up of geographical coordiantes on Earth (i.e., a sphere), so we need to flatten our boundary by spatially projecting it with an appropriate spatial reference system (CRS). For the District of Columbia, we are going to use the World Geodetic System 1984 (WGS84) Universal Transverse Mercator (UTM) Zone 18N EPSG:32619. We then convert the boundary into a owin object required by the spatstat package

dcp <- sp::spTransform(dcc, CRS("+init=EPSG:32618"))
dco <- spatstat::as.owin(dcp)

In this hypothetical example, we want to estimate the local power of detecting a spatial case cluster relative to control locations. Study participants that tested positive for a disease (i.e., cases) are hypothesized to be located in a circular area around the Navy Yard, an Environmental Protection Agency (EPA) Superfund Site (see the success story).

navy <- data.frame(lon = 326414.70444451, lat = 4304571.1539442)
spf <- SpatialPoints(coords = navy, proj4string = CRS("+init=EPSG:32618"))
# Plot
sp::plot(dcp, main = "Location of Hypothetical\nDisease Cluster")
sp::plot(spf, col = "red", add = T)
legend("right", legend = c("Navy Yard"), col = "red", pch = 3, bty = "n")

We will assume that 50 cases (e.g., n_case = 50) were uniformly distributed (e.g., samp_case = "uniform") within a circular radius with a radius of 2 kilometers (e.g., r_case = 2000).

If we were to conduct a study, where would we be sufficiently statistically powered to detect this spatial case cluster? To answer this question we will randomly sample locations (e.g., n_conrol = 950 or 5% disease prevalence) within the District of Columbia (e.g., samp_control = "uniform") as hypothetical study participants that would test negative for a disease (i.e., controls) were we to conduct a study. We can then resample control locations iteratively, as if we conducted the same study multiple times (e.g., sim_total = 10). We can conclude that we are sufficiently powered to detect a spatial case cluster in areas where a statistically significant (e.g., two-tailed alpha level of 0.05 or lower_tail = 0.025) spatial case cluster was located in at least 80% (e.g., p_thresh = 0.8) of these theoretical studies.

start_time <- Sys.time() # record start time
sim_power <- spatial_power(x_case = navy[[1]], y_case = navy[[2]], # center of cluster
                         n_case = 50, n_control = 950, # sample size of case/control
                         r_case = 2000, # size of case cluster (2 km)
                         samp_case = "uniform", samp_control = "uniform", # samplers
                         cascon = FALSE, # power for case cluster(s) only
                         lower_tail = 0.025, upper_tail = 0.975, # two-tailed alpha
                         sim_total = 10, # number of iterations
                         win = dco, # study area
                         resolution = 100, # number gridded knots on x-axis
                         edge = "diggle", # correct for edge effects
                         adapt = FALSE, # fixed-bandwidth
                         h0 = NULL, # automatically select bandwidth for each iteration
                         verbose = FALSE # no printout
                         )
end_time <- Sys.time() # record end time
time_srr <- end_time - start_time # Calculate run time

The process above took about 1.5 minutes to run. Of the 11 iterations we simulated an average of 950 (SD: 0) control locations with an average fixed bandwidth of 1.8 (SD: 0) kilometers.

cols <- c("cornflowerblue", "green", "red", "darkgreen", "blue") # colors
chars <- c(4,5) # symbols for point-locations
sizes <- c(0.5,0.5) # size of point-locations
p_thresh <- 0.8 # 80% of iterations with statistically significant results

## Data Visualizaiton of Input and Power
spatial_plots(input = sim_power, # use output of above simulation
              p_thresh = p_thresh, # power cut-off
              plot_pts = T, # default = TRUE
              chars = chars, # case, control
              sizes = sizes, # case, control
              cols = cols
)

Now, lets overlay our results ontop of a basemap. Here, we will use an open-source map from Stamen, that is unprojected in WGS84. We extract the rectagular box (i.e., bounding box) surrounding our polygon boundary of the District of Columbia (WGS84) and enlarge it using the custom function expandbb to the size of basemap we want to import.

expandbb <- function(bb, f) {
  x <- bb[3] - bb[1] # range of x values
  y <- bb[4] - bb[2] # range of y values
  nb <- bb # make a copy
  nb[1] <- bb[1] - (f * x) # xmin - left
  nb[3] <- bb[3] + (f * x) # xmax - right
  nb[2] <- bb[2] - (f * y) # ymin - bottom
  nb[4] <- bb[4] + (f * y) # ymax - top
  return(nb)}
dcbb <- expandbb(bb = bbox(dc), f = 0.01)
base_map <- ggmap::get_map(location = dcbb, maptype = "toner", source = "stamen")
plot(base_map, main = "Washington, D.C.")

Prepare the points from the first simulation for plotting in ggplot2 suite.

sim_pts <- sim_power$sim 
sim_pts <- as.SpatialPointsDataFrame.ppp(sim_pts)
crs(sim_pts) <- proj4string(dcp)
sim_pts_wgs84 <-  sp::spTransform(sim_pts, CRS("+init=epsg:4326"))
sim_pts_df <- tibble(data.frame(sim_pts_wgs84)) # convert to tidy data frame

Prepare the original boundary for plotting in ggplot2 suite.

dc_df <- tidy(dcc) # conver to a tidy dataframe
dcc$polyID <- sapply(slot(dcc, "polygons"), function(x) slot(x, "ID")) # preserve polygon id
dc_df <- merge(dc_df, dcc, by.x = "id", by.y="polyID") # merge data

Prepare the raster from the simulation for plotting in ggplot2 suite.

pvalprop <- tibble(x = sim_power$rx, y = sim_power$ry, z = sim_power$pval_prop) # extract output
lrr_narm <- na.omit(pvalprop) # remove NAs
coordinates(lrr_narm) <- ~ x + y # coordinates
gridded(lrr_narm) <- TRUE # gridded
pvalprop_raster <- raster::raster(lrr_narm) # convert to raster
rm(pvalprop, lrr_narm) # conserve memory
crs(pvalprop_raster) <- crs(dcp) # set output project (UTM 18N)
pvalprop_raster <- raster::projectRaster(pvalprop_raster, crs = crs(dc)) # unproject (WGS84) 
rtp <- rasterToPolygons(pvalprop_raster) # convert to polygons
rtp@data$id <- 1:nrow(rtp@data)   # add id column for join
rtpFort <- tidy(rtp, data = rtp@data) # convert to tibble
rtpFortMer <- merge(rtpFort, rtp@data, by.x = 'id', by.y = 'id')  # join data
rampcols <- grDevices::colorRampPalette(colors = c(cols[5], cols[2]), space="Lab")(length(raster::values(pvalprop_raster))) # set colorramp

Plot local power as a continuous outcome with point-locations

ggmap(base_map) + # basemap
  geom_polygon(data = dc_df, # original boundary
               aes(x = long, y = lat,
                   group = group),
               fill = "transparent",
               colour = "black") +
  geom_polygon(data = rtpFortMer, # output raster
               aes(x = long, y = lat, group = group, fill = z), 
               size = 0, 
               alpha = 0.5) +
  scale_fill_gradientn(colours = rampcols) + # colors for raster
  geom_point(data = sim_pts_df, # simulated point-locations
             aes(x = mx, y = my, color = marks, shape = marks),
             alpha = 0.8
             ) + 
  scale_color_manual(values = cols[4:5]) + # fill of point-locations
  scale_shape_manual(values = chars) + # shope of point-locations
  labs(x = "", y = "", fill = "Power", color = "", shape = "") # legend labels

Plot local power as a categorical outcome with point-locations

pvalprop_reclass <- raster::reclassify(pvalprop_raster, c(-Inf, p_thresh-0.0000001, 1,
                                                          p_thresh-0.0000001, Inf, 2)) # categorical outcome
rtp <- rasterToPolygons(pvalprop_reclass) # convert to polygons
rtp@data$id <- 1:nrow(rtp@data)   # add id column for join
rtpFort <- tidy(rtp, data = rtp@data) # convert to tibble
rtpFortMer <- merge(rtpFort, rtp@data, by.x = 'id', by.y = 'id')  # join data

ggmap(base_map) + # basemap
  geom_polygon(data = dc_df, # original boundary
               aes(x = long, y = lat,
                   group = group),
               fill = "transparent",
               colour = "black") +
  geom_polygon(data = rtpFortMer, # output raster
               aes(x = long, y = lat, group = group, fill = as.factor(z)), 
               size = 0, 
               alpha = 0.5) +
  scale_fill_manual(values = cols[c(5,2)], labels = c("insufficient", "sufficient")) + # colors for raster
  labs(x = "", y = "", fill = "Power") # legend labels

Based on 10 and uniform sampling of 950 control participants located within the District of Columbia, we are sufficiently powered to detect the disease cluster in the Navy Yard area.